1 Introduction

This tutorial is a replicate of kaggle kernel writen by Heads or Tails. The goal of this EDA is to predict the statistics of visotors in two touristics areas in Japon. The challenge was published by kaggle.

1.1 Load librairies

#library(readr)
library(tibble)
library('stringr') # string manipulation
library('data.table') # data manipulation
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year
## The following object is masked from 'package:base':
## 
##     date
library(ggplot2)
library(grid)
library(leaflet)
library(tidyr)
library(ggExtra)
library(ggridges)
## 
## Attaching package: 'ggridges'
## The following object is masked from 'package:ggplot2':
## 
##     scale_discrete_manual

1.2 Load data

air_visit <- as_tibble(fread(str_c("air_visit_data.csv")))
air_reserve <- as_tibble(fread(str_c('air_reserve.csv'))) 
air_store_info <- as_tibble(fread(str_c('air_store_info.csv')))
date_info <- as_tibble(fread(str_c('date_info.csv')))
hpg_reserve <- as_tibble(fread(str_c('hpg_reserve.csv')))
hpg_store_info <- as_tibble((fread(str_c('hpg_store_info.csv'))))
sample_submission <- as_tibble(fread(str_c('sample_submission.csv')))
store_id_relation <- as_tibble(fread(str_c('store_id_relation.csv')))

1.3 multiplot function

# Define multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }

 if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

2 Overview: File structure and content

2.1 Air visits

summary(air_visit)
##  air_store_id        visit_date           visitors     
##  Length:252108      Length:252108      Min.   :  1.00  
##  Class :character   Class :character   1st Qu.:  9.00  
##  Mode  :character   Mode  :character   Median : 17.00  
##                                        Mean   : 20.97  
##                                        3rd Qu.: 29.00  
##                                        Max.   :877.00
tibble::glimpse(air_visit)
## Observations: 252,108
## Variables: 3
## $ air_store_id <chr> "air_ba937bf13d40fb24", "air_ba937bf13d40fb24", "ai…
## $ visit_date   <chr> "2016-01-13", "2016-01-14", "2016-01-15", "2016-01-…
## $ visitors     <int> 25, 32, 29, 22, 6, 9, 31, 21, 18, 26, 21, 11, 24, 2…

The tibble shows the number of visitors by store for each date.

air_visit %>% dplyr::distinct(air_store_id) %>% nrow()
## [1] 829
air_visit %>% dplyr::distinct(visit_date) %>% nrow()
## [1] 478

There is 829 different store and 478 dates

2.2 Air Reserve

summary(air_reserve)
##  air_store_id       visit_datetime     reserve_datetime  
##  Length:92378       Length:92378       Length:92378      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  reserve_visitors 
##  Min.   :  1.000  
##  1st Qu.:  2.000  
##  Median :  3.000  
##  Mean   :  4.482  
##  3rd Qu.:  5.000  
##  Max.   :100.000
glimpse(air_reserve)
## Observations: 92,378
## Variables: 4
## $ air_store_id     <chr> "air_877f79706adbfb06", "air_db4b38ebe7a7ceff",…
## $ visit_datetime   <chr> "2016-01-01 19:00:00", "2016-01-01 19:00:00", "…
## $ reserve_datetime <chr> "2016-01-01 16:00:00", "2016-01-01 19:00:00", "…
## $ reserve_visitors <int> 1, 3, 6, 2, 5, 2, 4, 2, 2, 2, 3, 3, 2, 6, 7, 41…

The number of reservation is 92378, distributed for 314 air store.

air_reserve %>% dplyr::distinct(air_store_id) %>% nrow()
## [1] 314

2.3 HPG Reserve

summary(hpg_reserve)
##  hpg_store_id       visit_datetime     reserve_datetime  
##  Length:2000320     Length:2000320     Length:2000320    
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  reserve_visitors 
##  Min.   :  1.000  
##  1st Qu.:  2.000  
##  Median :  3.000  
##  Mean   :  5.074  
##  3rd Qu.:  6.000  
##  Max.   :100.000
tibble::glimpse(hpg_reserve)
## Observations: 2,000,320
## Variables: 4
## $ hpg_store_id     <chr> "hpg_c63f6f42e088e50f", "hpg_dac72789163a3f47",…
## $ visit_datetime   <chr> "2016-01-01 11:00:00", "2016-01-01 13:00:00", "…
## $ reserve_datetime <chr> "2016-01-01 09:00:00", "2016-01-01 06:00:00", "…
## $ reserve_visitors <int> 1, 3, 2, 5, 13, 2, 2, 2, 2, 6, 2, 2, 2, 2, 5, 4…

There are 2000320 revervation for 13325 stores

hpg_reserve %>% dplyr::distinct(hpg_store_id) %>% nrow()
## [1] 13325

2.4 Air Store

summary(air_store_info)
##  air_store_id       air_genre_name     air_area_name         latitude    
##  Length:829         Length:829         Length:829         Min.   :33.21  
##  Class :character   Class :character   Class :character   1st Qu.:34.70  
##  Mode  :character   Mode  :character   Mode  :character   Median :35.66  
##                                                           Mean   :35.65  
##                                                           3rd Qu.:35.69  
##                                                           Max.   :44.02  
##    longitude    
##  Min.   :130.2  
##  1st Qu.:135.3  
##  Median :139.7  
##  Mean   :137.4  
##  3rd Qu.:139.8  
##  Max.   :144.3
tibble::glimpse(air_store_info)
## Observations: 829
## Variables: 5
## $ air_store_id   <chr> "air_0f0cdeee6c9bf3d7", "air_7cc17a324ae5c7dc", "…
## $ air_genre_name <chr> "Italian/French", "Italian/French", "Italian/Fren…
## $ air_area_name  <chr> "Hyōgo-ken Kōbe-shi Kumoidōri", "Hyōgo-ken Kōbe-s…
## $ latitude       <dbl> 34.69512, 34.69512, 34.69512, 34.69512, 35.65807,…
## $ longitude      <dbl> 135.1979, 135.1979, 135.1979, 135.1979, 139.7516,…

The number of air area

air_store_info %>% dplyr::distinct(air_area_name) %>% nrow()
## [1] 103

The number of stores

air_store_info %>% dplyr::distinct(air_store_id) %>% nrow()
## [1] 829

The number of latitude

air_store_info %>% dplyr::distinct(latitude) %>% nrow()
## [1] 108

The number of longitude

air_store_info %>% dplyr::distinct(longitude) %>% nrow()
## [1] 108

the number of cuisine genre in air

air_store_info %>% dplyr::distinct(air_genre_name) %>% nrow()
## [1] 14

2.5 HPG Stores

summary(hpg_store_info)
##  hpg_store_id       hpg_genre_name     hpg_area_name         latitude    
##  Length:4690        Length:4690        Length:4690        Min.   :33.31  
##  Class :character   Class :character   Class :character   1st Qu.:34.69  
##  Mode  :character   Mode  :character   Mode  :character   Median :35.66  
##                                                           Mean   :35.81  
##                                                           3rd Qu.:35.70  
##                                                           Max.   :43.77  
##    longitude    
##  Min.   :130.3  
##  1st Qu.:135.5  
##  Median :139.5  
##  Mean   :137.7  
##  3rd Qu.:139.7  
##  Max.   :143.7
tibble::glimpse(hpg_store_info)
## Observations: 4,690
## Variables: 5
## $ hpg_store_id   <chr> "hpg_6622b62385aec8bf", "hpg_e9e068dd49c5fa00", "…
## $ hpg_genre_name <chr> "Japanese style", "Japanese style", "Japanese sty…
## $ hpg_area_name  <chr> "Tōkyō-to Setagaya-ku Taishidō", "Tōkyō-to Setaga…
## $ latitude       <dbl> 35.64367, 35.64367, 35.64367, 35.64367, 35.64367,…
## $ longitude      <dbl> 139.6682, 139.6682, 139.6682, 139.6682, 139.6682,…

There are 4690 stores

hpg_store_info %>% dplyr::distinct(latitude) %>% nrow()
## [1] 129

There are 119 area name

hpg_store_info %>% dplyr::distinct(hpg_area_name) %>% nrow()
## [1] 119

There are 34 genres of cuisine in HPG

hpg_store_info %>% dplyr::distinct(hpg_genre_name) %>% nrow()
## [1] 34

2.6 Holidayes

glimpse(date_info)
## Observations: 517
## Variables: 3
## $ calendar_date <chr> "2016-01-01", "2016-01-02", "2016-01-03", "2016-01…
## $ day_of_week   <chr> "Friday", "Saturday", "Sunday", "Monday", "Tuesday…
## $ holiday_flg   <int> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,…

2.7 Store IDs

glimpse(store_id_relation)
## Observations: 150
## Variables: 2
## $ air_store_id <chr> "air_63b13c56b7201bd9", "air_a24bf50c3e90d583", "ai…
## $ hpg_store_id <chr> "hpg_4bc649e72e2a239a", "hpg_c34b496d0305a809", "hp…

There are 150 relation between Air (829 stores) and HPG (4690 stores) stores.

2.8 Sample Submission

tibble::glimpse(sample_submission)
## Observations: 32,019
## Variables: 2
## $ id       <chr> "air_00a91d42b08b08d9_2017-04-23", "air_00a91d42b08b08d…
## $ visitors <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

the submision file should be two columns. The first one is the cancatenation of store_id and date. the 2sd is the number of visitors.

2.9 Missing Values

purrr::map(ls(), is.na)
## [[1]]
## [1] FALSE
## 
## [[2]]
## [1] FALSE
## 
## [[3]]
## [1] FALSE
## 
## [[4]]
## [1] FALSE
## 
## [[5]]
## [1] FALSE
## 
## [[6]]
## [1] FALSE
## 
## [[7]]
## [1] FALSE
## 
## [[8]]
## [1] FALSE
## 
## [[9]]
## [1] FALSE

2.10 Reformating dates

air_visit <- air_visit %>% mutate(visit_date = ymd(visit_date))
air_reserve <- air_reserve %>% mutate(visit_datetime = ymd_hms(visit_datetime),
                                      reserve_datetime = ymd_hms(reserve_datetime))
hpg_reserve <- hpg_reserve %>% mutate(visit_datetime = ymd_hms(visit_datetime),
                                      reserve_datetime = ymd_hms(reserve_datetime))

air_store_info <- air_store_info %>% mutate(air_genre_name = as.factor(air_genre_name),
                                            air_area_name = as.factor(air_area_name))
hpg_store_info <- hpg_store_info %>% mutate(hpg_genre_name = as.factor(hpg_genre_name),
                                            hpg_area_name = as.factor(hpg_area_name))

date_info <- date_info %>% mutate(holiday_flg = as.logical(holiday_flg),
                                  date = ymd(calendar_date))

3 Individual feature visualisations

Here we have a first look at the distribution of the feature in our individual data files before combining them for a more detailed analysis.

3.1 Air Visits

 p1 <- air_visit %>% 
  group_by(visit_date) %>% 
  summarise(all_visitors = sum(visitors)) %>%
  ggplot(aes(visit_date, all_visitors)) +
  geom_line(col = "blue") +
  labs(y = "All visitors", x = "Date")

 p2 <- air_visit %>%
   ggplot(aes(visitors)) +
   geom_vline(xintercept = 20, color ="red") +
   geom_histogram(fill = "blue", bins = 30) +
   #xlim(0,150)
   scale_x_log10()
 
 p3 <- air_visit %>%
   mutate(wday = wday(visit_date, label = TRUE)) %>%
   group_by(wday) %>%
   summarise(visits = median(visitors)) %>%
   ggplot(aes(wday, visits, fill = wday)) +
   geom_col() +
   theme(legend.position = "none") +
   labs(x = "Day of the week", y = "Median Visitors")
 
 p4  <- air_visit %>%
   mutate(Month = month(visit_date, label = TRUE)) %>%
   group_by(Month) %>%
   summarise(visits = median(visitors)) %>%
   ggplot(aes(Month, visits, fill = Month)) +
   geom_col() +
   theme(legend.position = 'none') +
   labs(x = "MOnth", y = "Median Visitors")
 
 layout <- matrix(c(1,1,1,1,2,3,4,4), 2, 4, byrow = TRUE)
 multiplot(p1, p2, p3, p4, layout=layout)

We will be forecasting for the last week of April plus May 2017, so let’s look at this time range in our 2016 training data:

air_visit %>%
  filter(visit_date > ymd("2016-04-15") & visit_date < ymd("2016-06-15")) %>%
  group_by(visit_date) %>%
  summarise(all_visitors = sum(visitors)) %>%
  ggplot(aes(visit_date, all_visitors)) +
  geom_line() +
  geom_smooth(method = "loess", color = "blue", span = 1/10) +
  labs(y = "All visitors", x = "Date")

3.2 Air Reservations

foo <- air_reserve %>%
  mutate(reserve_date = date(reserve_datetime),
         reserve_hour = hour(reserve_datetime),
         reserve_wday = wday(reserve_datetime, label = TRUE),
         visit_date = date(visit_datetime),
         visit_hour = hour(visit_datetime),
         visit_wday = wday(visit_datetime, label = TRUE),
         diff_hour = time_length(visit_datetime - reserve_datetime, unit = "hour"),
         diff_day = time_length(visit_datetime - reserve_datetime, unit = "day")) 

  p1 <- foo %>%
  group_by(visit_date) %>%
  summarise(all_vitisors = sum(reserve_visitors)) %>%
  ggplot(aes(visit_date, all_vitisors)) +
  geom_line() +
  labs(x = "Air visit date")
  
  p2 <- foo %>%
    group_by(visit_hour) %>%
    summarise(all_visitors = sum(reserve_visitors)) %>%
    ggplot(aes(visit_hour, all_visitors)) +
    geom_col(fill = "blue") +
    labs(x = "Time from reservation to visit [hours]")
  
  p3 <- foo %>%
    filter(diff_hour < 24 *7) %>%
    group_by(diff_hour) %>%
    summarise(all_visitors = sum(reserve_visitors)) %>%
    ggplot(aes(diff_hour, all_visitors)) +
    geom_col(fill= "blue") +
    labs(x = "Time from reservation (hours)", y = "all_visitors")
  
  layout <- matrix(c(1, 1, 2, 3), 2, 2, byrow = TRUE)
  
  multiplot(p1, p2, p3, layout = layout)

3.3 HPG Reservation

too <- hpg_reserve %>%
  mutate(visit_date = date(visit_datetime),
         visit_hour = hour(visit_datetime),
         visit_wday = wday(visit_datetime),
         reserve_date = date(reserve_datetime),
         reserve_hour = hour(reserve_datetime),
         reserve_wday = wday(reserve_datetime),
         diff_hour = time_length(visit_datetime - reserve_datetime, unit = "hour"),
         diff_day = time_length(visit_datetime - reserve_datetime, unit = "day")
         )

p1 <- too %>%
  group_by(visit_date) %>%
  summarise(all_visitors = sum(reserve_visitors)) %>%
  ggplot(aes(visit_date, all_visitors)) +
  geom_line()

p2 <- too %>%
  group_by(visit_hour) %>%
  summarise(all_visitors = sum(reserve_visitors)) %>%
  ggplot(aes(visit_hour, all_visitors)) +
  geom_col(fill="red")

p3 <- too %>%
  group_by(diff_hour) %>%
  summarise(all_visitors = sum(reserve_visitors)) %>%
  ggplot(aes(diff_hour, all_visitors)) +
  geom_col(fill = "red") +
  xlim(0,150)

 layout <- matrix(c(1,1,2,3), 2, 2, byrow = TRUE)
 multiplot(p1, p2, p3, layout = layout)
## Warning: Removed 1978 rows containing missing values (position_stack).
## Warning: Removed 2 rows containing missing values (geom_col).

3.4 Air Store

leaflet(air_store_info) %>%
  addTiles() %>%
  addProviderTiles("CartoDB.Positron") %>%
  addMarkers(~longitude, ~latitude,
             popup = ~air_store_id, label = ~air_genre_name,
             clusterOptions = markerClusterOptions()
             )
p1 <- air_store_info %>%
  group_by(air_area_name) %>%
  count() %>%
  ungroup() %>%
  top_n(15,n) %>%
  ggplot(aes(reorder(air_area_name, n, FUN = min), n, fill = air_area_name )) +
  geom_col()+
  coord_flip()+
  theme(legend.position = "none") +
  labs(x= "Top 15 area (air_aera_name)", y ="Number of air restaurants")


p2 <- air_store_info %>%
  group_by(air_genre_name) %>%
  count() %>%
  #ungroup() %>%
  top_n(15, n) %>%
  ggplot(aes(reorder(air_genre_name, n), n, fill = air_genre_name)) +
  geom_col() +
  coord_flip()+
  theme(legend.position = "none")

layout <- matrix(c(1,2), 2,1, byrow = TRUE)
multiplot(p1,p2, layout = layout)

3.5 HPG Store

leaflet(hpg_store_info) %>%
  addTiles() %>%
  addProviderTiles("cartoDB.positron") %>%
  addMarkers(~longitude, ~latitude,
             popup = ~hpg_store_id, label = ~hpg_genre_name,
             clusterOptions = markerClusterOptions()
             )
p1 <- hpg_store_info %>%
  group_by(hpg_genre_name) %>% 
  count() %>%
  ggplot(aes(reorder(hpg_genre_name, n, FUN = min), n, fill = hpg_genre_name)) +
  geom_col() +
  coord_flip() +
  theme(legend.position = "none") +
  labs(x = "Type of cuisine (hpg_genre_name)", y = "Number of hpg restaurants")

p2 <- hpg_store_info %>%
  group_by(hpg_area_name) %>%
  count() %>%
  arrange(desc(n)) %>%
  head(15) %>%
  ggplot(aes(reorder(x = hpg_area_name, n, FUN = min), n, fill = hpg_area_name)) +
  geom_col() +
  coord_flip() +
  theme(legend.position = "none") +
  labs(x = "Top 15 areas (hpg_area_name)", y = "Number of hpg restaurants")

layout <- matrix(c(1,2), 1,2, byrow=TRUE)
multiplot(p1, p2, layout = layout)

3.6 Holidays

date_info %>%
  group_by(holiday_flg) %>%
  count() %>%
  ggplot(aes(x = holiday_flg, y = n, fill = holiday_flg)) +
  geom_col() +
  theme(legend.position = "none")

date_info %>%
  filter(date  > ymd("2017-04-15") & date < ymd("2017-06-01")) %>%
ggplot(aes(x = date, y= holiday_flg, color = holiday_flg)) +
geom_point() +
  theme(legend.position = "none")

3.7 Test Data set

air_visit_train <- air_visit %>%
  rename(date = visit_date) %>%
  distinct(date) %>%
  mutate(dset= "train")

bar <- sample_submission %>%
  tidyr::separate(id, c("foo","bar", "date"), sep = "_") %>%
  mutate(date = ymd(date)) %>%
  distinct(date) %>%
  mutate(dset = "test")

air_visit_train <- air_visit_train %>%
  bind_rows(bar)

year(air_visit_train$date) <- 2017

air_visit_train %>%
  filter(!is.na(date)) 
## # A tibble: 516 x 2
##    date       dset 
##    <date>     <chr>
##  1 2017-01-13 train
##  2 2017-01-14 train
##  3 2017-01-15 train
##  4 2017-01-16 train
##  5 2017-01-18 train
##  6 2017-01-19 train
##  7 2017-01-20 train
##  8 2017-01-21 train
##  9 2017-01-22 train
## 10 2017-01-23 train
## # … with 506 more rows
  #mutate(year = forcats::fct_relevel(as.factor(year), c("2017", "2016")))

4 Feature relations

4.1 Visitors per genre

air_visit %>%
left_join(air_store_info, by = "air_store_id") %>%
  group_by(visit_date, air_genre_name) %>%
  summarise(mean_visitors = mean(visitors)) %>%
  ungroup() %>%
  ggplot(aes(visit_date, mean_visitors, color = air_genre_name)) +
  geom_line() +
  labs(y = "Average number of visitors to 'air' restaurants", x = "Date") +
  theme(legend.position = "none") +
  scale_y_log10() +
  facet_wrap(~ air_genre_name)

air_visit %>%
left_join(air_store_info, by = "air_store_id") %>%
  mutate(wday = wday(visit_date, label = TRUE)) %>%
  group_by(wday, air_genre_name) %>%
 summarise(mean_visitors = mean(visitors)) %>%
  ggplot(aes(y = mean_visitors, x= air_genre_name, color = wday)) +
  geom_point() +
  theme(legend.position = "left", axis.text.y = element_blank(),
        plot.title = element_text(size = 14)) +
  coord_flip() +
  scale_color_hue()

air_visit %>%
left_join(air_store_info, by = "air_store_id") %>%
  group_by(air_genre_name) %>%
  ggplot(aes(x = visitors, y = air_genre_name)) +
  ggridges::geom_density_ridges(bandwidth = 0.1) +
  scale_x_log10() +
  ggridges::scale_fill_cyclical(values = c("blue", "red"))

4.2 The impact of holidays

foo <- air_visit %>%
  #group_by(visit_date) %>%
  mutate(calendar_date = as.character(visit_date)) %>%
  left_join(date_info, by="calendar_date")

foo %>%
  ggplot(aes(holiday_flg, visitors, color = holiday_flg)) +
  geom_boxplot() +
  scale_y_log10() +
  theme(legend.position = "none")

foo %>%
  mutate(wday = wday(date, label = TRUE)) %>%
  group_by(wday, holiday_flg) %>%
  summarise(visitors_mean = mean(visitors)) %>%
  ggplot(aes(x= wday, visitors_mean, color = holiday_flg)) +
  geom_point()

4.3 Restaurants per aera and the effect on visitor numbers

air_store_info %>%
  #mutate(air_area_name = str_sub(air_area_name, 1, 12))
  group_by(air_area_name) %>%
  count(air_genre_name) %>%
  head(50) %>%
  ggplot(aes(x = air_area_name, y = air_genre_name)) +
  geom_point(aes(size = n)) +
  theme(axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9))

hpg_store_info %>%
    mutate(area = str_sub(hpg_area_name, 1, 10)) %>%
  group_by(area) %>%
  count(hpg_genre_name) %>%
  head(100) %>%
  ggplot(aes(x= area, y = hpg_genre_name )) +
  geom_point(aes(size = n)) +
  theme(legend.position = "bottom",axis.text.x = element_text(angle=45, hjust = 1, vjust= 0.9))

air_store_info %>%
  group_by(air_genre_name) %>%
  count(air_area_name) %>%
  ggplot(aes(reorder(x= air_genre_name, n, FUN = mean ), y = n)) +
  scale_y_log10() +
  geom_boxplot() +
  geom_jitter(color ="blue") +
  coord_flip()

foo <- air_visit %>%
  left_join(air_store_info, by = "air_store_id")

bar <- air_store_info %>%
  group_by(air_genre_name, air_area_name) %>%
  count()

foobar <- hpg_store_info %>%
  group_by(hpg_genre_name, hpg_area_name) %>%
  count()

p1 <- bar %>%
  ggplot(aes(n)) +
  geom_histogram(fill = "blue", binwidth = 1) +
  labs(x = "Air genres per aera")

p2 <- foobar %>%
  ggplot(aes(n)) +
  geom_histogram(fill = "red", binwidth = 1) +
  labs(x = "HPG genres per area")


P3 <- foo %>%
  group_by(air_genre_name, air_area_name) %>%
  summarise(mean_log_visit = mean(log1p(visitors))) %>%
  left_join(bar, by = c("air_genre_name","air_area_name")) %>%
  group_by(n) %>%
  summarise(mean_mlv = mean(mean_log_visit),
          sd_mlv = sd(mean_log_visit)) %>%
  tidyr::replace_na(list(sd_mlv = 0)) %>%
  ggplot(aes(n, mean_mlv)) +
  geom_point(color = "blue", size = 4) +
  geom_errorbar(aes(ymin = mean_mlv - sd_mlv, ymax = mean_mlv + sd_mlv), width = 0.5, size = 0.7, color = "blue") +
  labs(x="Case of identifcation Air genres per area", y = "Mean +/- SD of \n mean Logp1 visitors")

4.4 Reservations vs Visits

foo <- air_reserve %>%
  mutate(visit_date = date(visit_datetime)) %>%
  group_by(air_store_id, visit_date) %>%
  summarise(reserve_visitors_air = sum(reserve_visitors))


  
  bar <- hpg_reserve %>%
    mutate(visit_date = date(visit_datetime)) %>%
    group_by(hpg_store_id, visit_date) %>%
    summarise(reserve_visitors_hpg = sum(reserve_visitors)) %>%
    inner_join(store_id_relation, by = "hpg_store_id")
  
  all_reserve <- air_visit %>%
    inner_join(foo, by = c("air_store_id", "visit_date")) %>%
    inner_join(bar, by = c("air_store_id", "visit_date")) %>%
    mutate(reserve_visitors = reserve_visitors_air + reserve_visitors_hpg)
  
  p <- all_reserve %>%
    filter(reserve_visitors < 120 ) %>%
    ggplot(aes(reserve_visitors, visitors )) +
    geom_point(color = "black", alpha = 0.5) +
    geom_abline(slope = 1, intercept = 0, color = "grey60") +
    geom_smooth(method = "lm", color = "blue") 
  
  ggMarginal(p,  type = "histogram", fill = "blue", bins=50)

p1 <- all_reserve %>%
  ggplot(aes(visitors - reserve_visitors)) +
  geom_histogram(binwith = 5 , fill = "black") +
  coord_flip() +
  labs(x = "")
## Warning: Ignoring unknown parameters: binwith
p2 <- all_reserve %>%
  ggplot(aes(visitors - reserve_visitors_air)) +
  geom_histogram(binwidth = 5, fill = "blue") +
  coord_flip() +
  labs(x = "")

p3 <- all_reserve %>%
  ggplot(aes(visitors - reserve_visitors_hpg)) +
  geom_histogram(binwidth = 5, fill = "red") +
  coord_flip() +
  labs(x = "")

p4 <- all_reserve %>%
  ggplot(aes(visit_date, visitors - reserve_visitors)) +
  geom_hline(yintercept = c(150, 0, -250)) +
  geom_line() +
  geom_line(aes(visit_date, visitors - reserve_visitors_air +150), color = "blue") +
  geom_line(aes(visit_date, visitors - reserve_visitors_hpg - 250), color = "red") +
  ggtitle("Visitors - Reserved: all (black), air (blue), hpg (red)")


layout <- matrix(c(4,4,2,4,4,1,4,4,3), 3, 3, byrow = TRUE)
multiplot(p1, p2, p3, p4, layout = layout)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

all_reserve %>%
  mutate(date = visit_date) %>%
  left_join(date_info, by = "date") %>%
  ggplot(aes(visitors - reserve_visitors, fill = holiday_flg)) +
  geom_density(alpha = 0.5)

5 Feature engineering

air_visit <- air_visit %>%
  mutate(wday = wday(visit_date, label = TRUE),
         wday = forcats::fct_relevel(wday, c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")), month = month(visit_date, label = TRUE))

air_reserve <- air_reserve %>%
  mutate(reserve_date = date(reserve_datetime),
         reserve_hour = hour(reserve_datetime),
         reserve_wday = wday(reserve_datetime, label = TRUE),
         reserve_wday = forcats::fct_relevel(reserve_wday, c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")),
         visit_date = date(visit_datetime),
         visit_hour = hour(visit_datetime),
         visit_wday = wday(visit_datetime, label = TRUE),
         viist_wday = forcats::fct_relevel(visit_wday, c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")),
         diff_hour = time_length(visit_datetime - reserve_datetime, unit = "hour"),
         diff_day = time_length(visit_datetime - reserve_datetime, unit = "day"))


hpg_reserve %>%
  mutate(reserve_date = date(reserve_datetime),
         reserve_hour = hour(reserve_datetime),
         reserve_wday = wday(reserve_datetime, label = TRUE),
         reserve_wday = forcats::fct_relevel(reserve_wday, c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")),
         visit_date = date(visit_datetime),
         visit_hour = hour(visit_datetime),
         visit_wday = wday(visit_datetime, label = TRUE),
         visit_wday = forcats::fct_relevel(visit_wday, c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")),
         diff_hour = time_length(visit_datetime - reserve_datetime, unit = "hour"),
         diff_day = time_length(visit_datetime - reserve_datetime, unit = "day"))
## # A tibble: 2,000,320 x 12
##    hpg_store_id visit_datetime      reserve_datetime    reserve_visitors
##    <chr>        <dttm>              <dttm>                         <int>
##  1 hpg_c63f6f4… 2016-01-01 11:00:00 2016-01-01 09:00:00                1
##  2 hpg_dac7278… 2016-01-01 13:00:00 2016-01-01 06:00:00                3
##  3 hpg_c8e24dc… 2016-01-01 16:00:00 2016-01-01 14:00:00                2
##  4 hpg_24bb207… 2016-01-01 17:00:00 2016-01-01 11:00:00                5
##  5 hpg_25291c5… 2016-01-01 17:00:00 2016-01-01 03:00:00               13
##  6 hpg_28bdf7a… 2016-01-01 17:00:00 2016-01-01 15:00:00                2
##  7 hpg_2a01a04… 2016-01-01 17:00:00 2016-01-01 17:00:00                2
##  8 hpg_2a84dd9… 2016-01-01 17:00:00 2016-01-01 15:00:00                2
##  9 hpg_2ad1798… 2016-01-01 17:00:00 2016-01-01 13:00:00                2
## 10 hpg_2c1d989… 2016-01-01 17:00:00 2016-01-01 15:00:00                6
## # … with 2,000,310 more rows, and 8 more variables: reserve_date <date>,
## #   reserve_hour <int>, reserve_wday <ord>, visit_date <date>,
## #   visit_hour <int>, visit_wday <ord>, diff_hour <dbl>, diff_day <dbl>
# count stores in area
air_count <- air_store_info %>%
  group_by(air_area_name) %>%
  summarise(air_count = n())

hpg_count <- hpg_store_info %>%
  group_by(hpg_area_name) %>%
  summarise(hpg_count = n())

# distance
med_coord_air <- air_store_info %>%
  summarise_at(vars(longitude:latitude), median)

med_coord_hpg <- hpg_store_info %>%
  summarise_at(vars(longitude:latitude), median)



air_coords <- air_store_info %>%
  select(longitude, latitude)

hpg_coords <- hpg_store_info %>%
  select(longitude, latitude)

air_store_info$dist <- geosphere::distCosine(air_coords, med_coord_air) / 1e3
hpg_store_info$dist <- geosphere::distCosine(hpg_coords, med_coord_hpg) / 1e3

# apply counts, dist; add prefecture

air_store_info %>%
  mutate(dist_group = as.integer(case_when(
    dist < 80  ~ 1,
    dist < 300 ~ 2, 
    dist < 500 ~ 3,
    dist < 750 ~ 4,
    TRUE ~ 5 ))) %>%
  left_join(air_count, by = "air_area_name") %>%
  separate(air_area_name, c("preferture"), sep = " ", remove = FALSE)
## Warning: Expected 1 pieces. Additional pieces discarded in 829 rows [1, 2,
## 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## # A tibble: 829 x 9
##    air_store_id air_genre_name air_area_name preferture latitude longitude
##    <chr>        <fct>          <fct>         <chr>         <dbl>     <dbl>
##  1 air_0f0cdee… Italian/French Hyōgo-ken Kō… Hyōgo-ken      34.7      135.
##  2 air_7cc17a3… Italian/French Hyōgo-ken Kō… Hyōgo-ken      34.7      135.
##  3 air_fee8dcf… Italian/French Hyōgo-ken Kō… Hyōgo-ken      34.7      135.
##  4 air_a17f077… Italian/French Hyōgo-ken Kō… Hyōgo-ken      34.7      135.
##  5 air_83db5af… Italian/French Tōkyō-to Min… Tōkyō-to       35.7      140.
##  6 air_99c3eae… Italian/French Tōkyō-to Min… Tōkyō-to       35.7      140.
##  7 air_f183a51… Italian/French Tōkyō-to Min… Tōkyō-to       35.7      140.
##  8 air_6b9fa44… Italian/French Tōkyō-to Min… Tōkyō-to       35.7      140.
##  9 air_0919d54… Italian/French Tōkyō-to Min… Tōkyō-to       35.7      140.
## 10 air_2c6c79d… Italian/French Tōkyō-to Min… Tōkyō-to       35.7      140.
## # … with 819 more rows, and 3 more variables: dist <dbl>,
## #   dist_group <int>, air_count <int>

5.1 Days of the week & months of the year

air_visit %>%
  group_by(wday) %>%
  summarise(mean_log_visitors = mean(log1p(visitors)),
            sd_log_visitors = sd(log1p(visitors))) %>%
  ggplot(aes(wday, mean_log_visitors, color = wday)) +
  geom_point(size = 4) +
  geom_errorbar(aes(ymin = mean_log_visitors - sd_log_visitors,
                    ymax = mean_log_visitors + sd_log_visitors,
                    color = wday), width = 0.5, size = 0.7 ) +
  theme(legend.position = "none")

p2 <- air_visit %>%
  mutate(visitors = log1p(visitors)) %>%
  ggplot(aes(visitors, wday, fill = wday)) +
  ggridges::geom_density_ridges(bandwidth = 0.1) +
  scale_x_log10() +
  theme(legend.position = "none") +
  labs(x= "log1P(visitors", y ="")

p3 <- air_visit %>%
  group_by(month) %>%
  summarise(mean_log_visitors = mean(log1p(visitors)),
            sd_log_visitors = sd(log1p(visitors))) %>%
  ggplot(aes(month, mean_log_visitors, color = month)) +
  geom_point(size = 4) +
  geom_errorbar(aes(ymin = mean_log_visitors - sd_log_visitors,
                    ymax = mean_log_visitors + sd_log_visitors,
                    color = month), width = 0.5, size = 0.7) +
  theme(legend.position = "none")

p4 <- air_visit %>%
  mutate(visitors = log1p(visitors)) %>%
  ggplot(aes(visitors, month, fill = month)) +
  geom_density_ridges(bandwidth = 0.1) +
  scale_x_log10() +
  theme(legend.position = "none") +
  labs(x = "log1p(visitors)", y = "")

layout <- matrix(c(1,2,3,4),2,2,byrow=TRUE)

multiplot(p1, p2, p3, p4, layout=layout)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

6 Time series parameters

air_visit %>%
  left_join(air_store_info, by = "air_store_id") %>%
  group_by(air_store_id, air_genre_name) %>%
  summarise(mean_log_visits = mean(log1p(visitors)),
            mean_log_visits = mean(log1p(visitors)),
            sd_log_visits = sd(log1p(visitors))) %>%
  ungroup()
## # A tibble: 829 x 4
##    air_store_id         air_genre_name mean_log_visits sd_log_visits
##    <chr>                <fct>                    <dbl>         <dbl>
##  1 air_00a91d42b08b08d9 Italian/French            3.17         0.577
##  2 air_0164b9927d20bcc3 Italian/French            2.12         0.685
##  3 air_0241aa3964b7f861 Izakaya                   2.24         0.565
##  4 air_0328696196e46f18 Dining bar                1.97         0.663
##  5 air_034a3d5b40d5b1b1 Cafe/Sweets               2.46         0.824
##  6 air_036d4f1ee7285390 Cafe/Sweets               3.02         0.535
##  7 air_0382c794b73b51ad Cafe/Sweets               3.01         0.767
##  8 air_03963426c9312048 Izakaya                   3.47         0.762
##  9 air_04341b588bde96cd Izakaya                   3.49         0.543
## 10 air_049f6d5b402a31b2 Japanese food             2.37         0.609
## # … with 819 more rows
params_ts1 <- function(rownr){
  
  bar <- air_visit %>%
    filter(air_store_id == foo$air_store_id[rownr])
  slope <- summary(lm(visitors ~ visit_date, data = bar))$coef[2]
  slope_err <- summary(lm(visitors ~ visit_date, data = bar))$coef[4]
 
  foobar <- tibble(
    air_store_id = foo$air_store_id[rownr],
    slope = slope,
    slope_err = slope_err
  )
  return(foobar)

}

params <- params_ts1(1)

for (i in seq(2,nrow(foo))){

  params <- bind_rows(params, params_ts1(i))

}



ts_params <- foo %>%
  left_join(params, by = "air_store_id")